# Load the city crime-rate table: cities as row names, crime types as columns.
data_crime <- read.table("citycrime.txt", header = TRUE)
Sapply, apply, vapply and lapply
## Murder Rape Robbery Assault Burglary Larceny
## 20.88056 70.04583 637.56250 831.63056 1715.89167 4706.79722
## MVT
## 1377.92500
## Murder Rape Robbery Assault Burglary Larceny
## 20.88056 70.04583 637.56250 831.63056 1715.89167 4706.79722
## MVT
## 1377.92500
## Murder Rape Robbery Assault Burglary Larceny
## 20.88056 70.04583 637.56250 831.63056 1715.89167 4706.79722
## MVT
## 1377.92500
## Murder Rape Robbery Assault Burglary Larceny
## 20.88056 70.04583 637.56250 831.63056 1715.89167 4706.79722
## MVT
## 1377.92500
## $Murder
## [1] 20.88056
##
## $Rape
## [1] 70.04583
##
## $Robbery
## [1] 637.5625
##
## $Assault
## [1] 831.6306
##
## $Burglary
## [1] 1715.892
##
## $Larceny
## [1] 4706.797
##
## $MVT
## [1] 1377.925
Descriptive statistics
summary_stats = function(x) {
  # Named vector of basic descriptive statistics for a numeric vector x:
  # mean, standard deviation, max, min and median, in that order.
  stat_funs <- list(mean = mean, sd = sd, max = max, min = min, med = median)
  vapply(stat_funs, function(f) f(x), numeric(1))
}
# Apply the descriptive-statistics helper to each crime column; result is a
# named list (one stats vector per crime type).
crime_stats = lapply(data_crime, summary_stats)
Cities with highest crime rates
# City names come from the row labels of the crime table.
cities = rownames(data_crime)
# Return the city with the HIGHEST rate for one crime column.
# BUG FIX: the original used which.min(), so despite its name (and the
# "highest crime rates" heading) it returned the *lowest*-crime city;
# the cached output below (Honolulu, San-Jose, ...) reflects that old bug.
max_city = function(x) {
  cities[which.max(x)]
}
sapply(data_crime, max_city)
## Murder Rape Robbery Assault
## "Honolulu" "Santa-Ana" "Honolulu" "Virginia-Beach"
## Burglary Larceny MVT
## "San-Jose" "San-Jose" "Virginia-Beach"
Parallel
#library(parallel)
#wait = function(i) {
# Sys.sleep(3)
# }
# no_cores <- detectCores() - 1
# cl <- makeCluster(no_cores)
#
# parLapply(cl, 1:no_cores, wait)  # pass the function itself, not wait(); it must accept the element as an argument
#
# stopCluster(cl)
Map function - weighted average
# One random weight vector (one weight per city) for each of the 7 crime types.
# NOTE(review): no set.seed() call, so the weighted means below are not
# reproducible between runs -- the cached output cannot be regenerated exactly.
cost_per_city = replicate(7, runif(length(cities)), simplify = FALSE)
# Map() pairs each crime column with its weight vector: weighted.mean(x, w).
Map(weighted.mean, data_crime, cost_per_city)
## $Murder
## [1] 20.3482
##
## $Rape
## [1] 68.09087
##
## $Robbery
## [1] 610.0875
##
## $Assault
## [1] 805.6186
##
## $Burglary
## [1] 1682.386
##
## $Larceny
## [1] 4779.669
##
## $MVT
## [1] 1411.044
Reduce and filter
# Top-10 cities per crime type, highest rates first.
# FIX: spell out TRUE -- `T` is an ordinary variable that can be reassigned.
top10_cities = lapply(data_crime, function(x) cities[order(x, decreasing = TRUE)[1:10]])
# Cities in every top-10 list so far (manual nesting; Reduce() below is cleaner).
intersect(intersect(intersect(top10_cities$Murder, top10_cities$Rape), top10_cities$Robbery), top10_cities$Assault) #... and so on
## [1] "Atlanta"
# Fold intersect() over all seven top-10 lists at once -- same result as the
# manual nesting above.
Reduce(intersect, top10_cities)
## [1] "Atlanta"
# Keep only the crime types whose top-10 list contains Detroit.
Filter(function(x) "Detroit" %in% x, top10_cities)
## $Murder
## [1] "New-Orleans" "Washingt" "St-Louis" "Detroit" "Burmingham"
## [6] "Atlanta" "Baltimore" "Oakland" "Newark" "Chicago"
##
## $Rape
## [1] "Minneapolis" "Cleveland" "Oklahoma" "Kansas-City" "Memphis"
## [6] "Detroit" "Toledo" "Columbus" "Cicinnati" "Atlanta"
##
## $Robbery
## [1] "Newark" "St-Louis" "Miami" "Baltimore" "Atlanta"
## [6] "Detroit" "Chicago" "Tampa" "Washingt" "Oakland"
##
## $MVT
## [1] "Tampa" "Fresno" "Newark" "Detroit" "Miami"
## [6] "St-Louis" "Sacramento" "Atlanta" "Portland" "Boston"
Position() and Find()
# Base-graphics scatterplot of assault vs robbery rates.
# NOTE(review): the "Position() and Find()" heading above has no matching code.
plot(x = data_crime$Robbery, y = data_crime$Assault)
Quick plot
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.5
# Same scatterplot via ggplot2's quick-plot helper.
# NOTE(review): qplot() is deprecated in ggplot2 >= 3.4.0; fine for this
# vintage, but new code should use ggplot() + geom_point().
qplot(x = Robbery, y = Assault, data = data_crime)
ggplot standard functions
# Base plot object holding only the aesthetics; geoms are layered on below.
crime_plot = ggplot(data_crime, aes(x = Robbery, y = Assault))
# Plain scatterplot.
crime_plot + geom_point()
# Map murder rate to point color and larceny rate to point size.
crime_plot + geom_point(aes(color = Murder, size = Larceny))
# Continuous blue-to-red color scale for murder rate, on a white theme.
crime_plot + geom_point(aes(color = Murder)) +
scale_colour_gradient(high = "red", low = "blue") + theme_bw()
# Label points with city names; check_overlap drops labels that would collide.
crime_plot + geom_text(aes(label=rownames(data_crime)), check_overlap = TRUE)
3d plot
#install.packages("plotly")
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Interactive 3-D scatterplot of robbery, assault and murder rates.
# Specify the trace type and mode explicitly so plotly does not have to guess
# (silences the "No trace type specified" / "No scatter3d mode specifed"
# messages recorded below).
p <- plot_ly(data_crime, x = ~Robbery, y = ~Assault, z = ~Murder,
             type = "scatter3d", mode = "markers")
p
## No trace type specified:
## Based on info supplied, a 'scatter3d' trace seems appropriate.
## Read more about this trace type -> https://plot.ly/r/reference/#scatter3d
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
pairs
library(GGally)
## Warning: package 'GGally' was built under R version 3.2.5
## Warning: replacing previous import by 'utils::capture.output' when loading
## 'GGally'
## Warning: replacing previous import by 'utils::head' when loading 'GGally'
## Warning: replacing previous import by 'utils::installed.packages' when
## loading 'GGally'
## Warning: replacing previous import by 'utils::str' when loading 'GGally'
# Scatterplot matrix of all crime variables:
#   upper triangle = scatterplots, lower triangle = correlations,
#   diagonal = density estimates; axis labels suppressed for readability.
crime_pairs = ggpairs(data_crime,axisLabels = "none",
upper = list(continuous = "points", combo = "dot"),
lower = list(continuous = "cor", combo = "dot"),
diag = list(continuous = "densityDiag")) +
theme_bw()
# Auto-prints at top level in an interactive/knitr session.
crime_pairs
# Save the pairs plot to a PDF file in the working directory.
# FIX: wrap the plot in print() -- top-level auto-printing does not happen
# when this script is run via source(), which would leave the PDF empty.
pdf("crime_ggpairs.pdf")
print(crime_pairs)
dev.off()
## quartz_off_screen
## 2
Work in pairs to solve the next questions. In all problems, try to use functional programming when possible, and ggplot.
Simulate data from a multivariate normal distribution with mean 0 and covariance diag(\(3^2, 2^2,1,1,\ldots\)) using p=20 variables and n=100 observations. Compute the eigenvalues of the sample covariance and plot them.
Create a list containing different versions of the simulated data varying the number of observations on each one, from 100 to 2000. For each element in the list, compute the SVD and calculate the running time. Store the results on an array. Plot the running times as a function of the number of observations. (You can use proc.time()).
Repeat the previous problem, but using the eigendecomposition of the covariance matrix.
Repeat 2 and 3, but instead of varying the number of observations, change the number of variables from 100 to 2000.
# NOTE(review): this matrix is never used below, and the name `sample`
# shadows base::sample() -- candidate for removal or renaming.
sample = matrix(rnorm(1000*10), ncol = 10)
# Problem 1: n = 100 draws from a p = 20 normal with covariance
# diag(3^2, 2^2, 1, ..., 1); plot the sample-covariance eigenvalues.
p <- 20
n <- 100
col_sds <- c(3, 2, rep(1, p - 2))  # per-column standard deviations
X <- sweep(matrix(rnorm(n * p), ncol = p), 2, col_sds, "*")
eigenvalues <- eigen(cov(X))$values
qplot(y = eigenvalues, xlab = "Index", ylab = "Eigenvalues")
generate_data = function(n, p) {
  # Simulate an n x p matrix whose columns are independent centered normals
  # with standard deviations 3, 2, 1, ..., 1 (covariance diag(9, 4, 1, ...)).
  #
  # Args:
  #   n: number of observations (rows), >= 1.
  #   p: number of variables (columns); must be >= 2 for the sd pattern.
  #
  # Returns: the simulated numeric matrix.
  # FIX: the original's trailing assignment was superfluous and made the
  # return value invisible; the guard gives a clear error instead of the
  # confusing rep(1, negative) failure for p < 2.
  stopifnot(n >= 1, p >= 2)
  sweep(matrix(rnorm(n * p), ncol = p), 2, c(3, 2, rep(1, p - 2)), "*")
}
compute_svd = function(X) {
  # Elapsed wall-clock seconds to get the sample-covariance eigenvalues via
  # an SVD of the column-centered data: eigen(cov(X)) values equal
  # d^2 / (n - 1), with d the singular values of the centered matrix.
  t0 <- proc.time()
  centered <- scale(X, center = TRUE, scale = FALSE)
  sing_vals <- svd(centered)$d
  cov_eigenvalues <- sing_vals^2 / (nrow(centered) - 1)
  t1 <- proc.time()
  t1[3] - t0[3]
}
# Problem 2: vary the number of observations, with p fixed at 100.
# NOTE(review): the sequence runs to 5000 although the exercise says
# "from 100 to 2000" -- confirm whether the wider range is intentional.
data_obs = lapply(seq(100, 5000, 100), generate_data, p = 100)
svd_times = sapply(data_obs, compute_svd)
qplot(y = svd_times, main = "SVD", xlab = "Observations", ylab = "Running time")
compute_eig = function(X) {
  # Elapsed wall-clock seconds to compute the sample-covariance eigenvalues
  # via a direct eigendecomposition of cov(X).
  started <- proc.time()
  spectrum <- eigen(cov(X))$values
  finished <- proc.time()
  finished[3] - started[3]
}
# Problem 3: eigendecomposition timings on the same observation-varying data.
eig_times = sapply(data_obs, compute_eig)
qplot(y = eig_times, main = "Eig", xlab = "Observations", ylab = "Running time")
# Problem 4: vary the number of variables (100 to 2000), n fixed at 100,
# and repeat both timing experiments.
data_var = lapply(seq(100, 2000, 100), generate_data, n = 100)
svd_times = sapply(data_var, compute_svd)
qplot(y = svd_times, main = "SVD", xlab = "Variables", ylab = "Running time")
eig_times = sapply(data_var, compute_eig)
qplot(y = eig_times, main = "Eig", xlab = "Variables", ylab = "Running time")